** The 2004 data seems to have many issues: build a version of 2004 employment
** from the 2005 file (its lagged _1 variables) and the 2005 industry codes.
** Two files are saved, both tagged year = 2004: one by commune and industry,
** one by ZE2010 and industry.

// Forward slash in the path, like every other path in this file, so the script is not Windows-only
use siren nbheur_1 s_brut_1 catjur catjur_1 apet apet_1 apen  comt comt_1 comr pcs pcs_1 filt filt_1 age duree  ${condDADS} using "${tmp}/dads2005", clear
	destring catjur*, replace
	// Harmonise the lagged legal category within firm: every row gets the smallest catjur_1 of its siren
	bys siren (catjur_1): replace catjur_1 = catjur_1[1]
	keep if (floor(catjur_1/1000)==5) // keep private

	// force turns non-numeric identifiers into missing; those rows are dropped
	destring siren, replace force
	drop if mi(siren)
	// Fall back on the current-year industry code when the lagged one is missing
	replace apet_1 = apet if mi(apet_1)
	drop apen

	// Convert sectors: the lagged code becomes naf_rev1 and naf_rev2 is looked up, in order,
	// by (siren, comt), then by siren, then by naf_rev1. The update option only fills values still missing.
	// NOTE(review): the first lookup is keyed on comt, not comt_1 -- confirm this is intended for the 2004 build
	rename apet_1 naf_rev1
	merge m:1 siren comt using "${data}/Utils/SirenCom_NAF_rev12_dads", nogen keep(1 3) keepusing(naf_rev2)
	merge m:1 siren using "${data}/Utils/Siren_NAF_rev12", nogen keep(1 3 4 5) update  keepusing(naf_rev2)
	merge m:1 naf_rev1 using "${data}/Utils/NAF_rev12_impute", nogen keep(1 3 4 5) update  keepusing(naf_rev2)
	drop naf_rev1
	ren naf_rev2 apet_1

	** Aggregate engineer empl. is much lower in the data in 2003 and 2004, try to correct for that
	replace pcs_1 = pcs if siren == 383475092 //error for Thales in 2004 data, only 3 different PCS, 0 engineer given the size of Thales we decided to correct this manually as it creates a clear gap in total R&D spendings

	// Job-category dummies built from the lagged occupation code pcs_1
	gen hskill = substr(pcs_1,1,1)=="3"
	gen rdjob_eng = inlist(pcs_1,"383A","384A","385A","386A","388A")
	gen rdjob_tech = inlist(pcs_1,"473B","474B","475A","478A")
	gen job_eng = (substr(pcs_1,1,2) == "38")
	gen job_tech = (substr(pcs_1,1,2) == "47")
	gen job_hskill_noneng = (hskill==1 & job_tech==0 &  job_eng== 0 )
	// other = none of job_eng, job_tech, job_hskill_noneng
	gen other = (job_eng + job_tech + job_hskill_noneng == 0)


	// Hours and gross wages of each category (lagged values), zero for rows outside the category
	foreach cat in job_eng job_tech job_hskill_noneng other rdjob_eng rdjob_tech {
		gen `cat'_hrs = nbheur_1 *(`cat' == 1)
		gen `cat'_sbrut = s_brut_1 *(`cat' == 1)
	}
		
	//allocate ZE
	gen depcom = comt_1
	// Use the commune comr instead when the lagged workplace code starts with 98 or more, is 69999 or contains ZZZ.
	// NOTE(review): real() returns missing for non-numeric prefixes and missing >= 98 is true in Stata,
	// so empty or non-numeric workplace codes are also replaced by comr -- confirm this is intended
	replace depcom = comr if real(substr(depcom,1,2)) >= 98 | depcom == "69999" | regexm(depcom,"ZZZ")

	// Collapse the codes in the ranges below onto a single commune code each
	replace depcom = "75056" if substr(depcom,1,2) == "75"
	replace depcom = "13055" if inrange(real(depcom),13201,13216)
	replace depcom = "69123" if inrange(real(depcom),69381,69389)
	replace depcom = "59183" if depcom == "59540" | depcom == "59248"
	destring depcom, replace force
	drop if mi(depcom) // drops Corsica
	merge m:1 depcom using "${data}/Utils/depcom_ZE_mapping", nogen keep(1 3)
	//correct some inconsistencies
	drop if siren == 380129866 //Conversion France Telecom -> Orange, public status therefore absent before 2009
	replace apet_1 = "2630Z" if siren == 480570597  & ZE2010 == "0056" //large LG electronics plant suddenly changes industry
	replace apet_1 = "2651A" if siren == 383475092 //make sure Thales industry is right

	// Flag one row per (ZE2010, industry, firm) so that summing the flags counts firms
	// NOTE(review): the flag is set on a single row per ZE, so in the commune-level collapse a firm present
	// in several communes of the same ZE is counted in only one of them -- confirm
	bys ZE2010 apet_1 siren: gen nb_siren = _n == 1 
	bys ZE2010 apet_1 siren: gen nb_siren_eng = (_n == 1) & job_eng > 0 & !mi(job_eng)

	// Mean of duree over engineer rows within (ZE2010, industry)
	// NOTE(review): total() counts a missing duree as 0 while the denominator still counts the row
	bys ZE2010 apet_1: egen mean_duree_inge = total(duree*job_eng)
	bys ZE2010 apet_1: egen foo = total(job_eng)
	replace mean_duree_inge = mean_duree_inge / foo
	drop foo

	// From here on apet holds the converted lagged industry code
	drop apet
	ren apet_1 apet

// Commune-level file; preserve and restore keep the job-level data for the ZE-level collapse below
preserve
	gcollapse (sum) *_hrs *_sbrut nb_siren* (mean) mean_duree_inge, by(depcom apet)
	ren apet ape	
	gen year = 2004
	compress	
save "${tmp}/DADSpri_RD_com_NAF_2004", replace
restore

// ZE-level file
gcollapse (sum) *_hrs *_sbrut nb_siren* (mean) mean_duree_inge, by(ZE2010 apet)	
ren apet ape	
gen year = 2004
compress	
save "${tmp}/DADSpri_RD_ZE_NAF_2004", replace


// 2005-2019
// For each year: keep private firms, express the industry code in NAF rev. 2 when needed,
// build job-category dummies from pcs, allocate each row to a commune and a ZE,
// then save three files: firm by ZE by industry, commune by industry, ZE by industry.


foreach y of numlist 2005(1)2019 {
	global y `y'
	use siren nbheur s_brut catjur catjur_1 apet apen comt comr pcs pcs_1 filt age duree  ${condDADS} using "${tmp}/dads${y}", clear
	
	destring catjur*, replace
	// Harmonise the legal category within firm: every row gets the smallest catjur of its siren
	bys siren (catjur): replace catjur = catjur[1]
	// drop Particuliers employeurs
	// Only possible while siren is still a string; skipped when the file stores it as a number
	cap confirm string var siren
	if _rc == 0 {
		drop if inlist(substr(string(catjur),1,1),"1","2","3","5","6")  == 1 & substr(siren,1,1)=="P"
		drop if inlist(substr(string(catjur_1),1,1),"1","2","3","5","6")  == 1 & substr(siren,1,1)=="P"
	}
	keep if (floor(catjur/1000)==5) // keep private
	
	// force turns non-numeric identifiers into missing; those rows are dropped
	destring siren, replace force
	drop if mi(siren)
	
	replace apet = apen if mi(apet)
	drop apen
	
	// Up to 2007 the code is treated as NAF rev. 1 and naf_rev2 is looked up, in order,
	// by (siren, comt), then by siren, then by naf_rev1. The update option only fills values still missing.
	if `y'<=2007 {
		ren apet naf_rev1
		merge m:1 siren comt using "${data}/Utils/SirenCom_NAF_rev12_dads", nogen keep(1 3) keepusing(naf_rev2)
		merge m:1 siren using "${data}/Utils/Siren_NAF_rev12", nogen keep(1 3 4 5) update  keepusing(naf_rev2)
		merge m:1 naf_rev1 using "${data}/Utils/NAF_rev12_impute", nogen keep(1 3 4 5) update  keepusing(naf_rev2)
		drop naf_rev1
		ren naf_rev2 apet
		}
	
	** Aggregate engineer empl. is much lower in the data in 2003 and 2004, try to correct for that
	// NOTE(review): the two conditions below are never true for the 2005-2019 range of this loop;
	// they only matter if earlier years are added back to the numlist
	if `y' == 2004 {
		replace pcs = pcs_1 if siren == 383475092 //error for Thales in 2004 data
	}
	if  `y' < 2005 {
		replace pcs = pcs_1 if substr(pcs_1,1,2) == "38" & substr(pcs,1,1) == "3" & substr(pcs,1,2) != "38"
	}
	
	// Job-category dummies built from the occupation code pcs
	gen hskill = substr(pcs,1,1)=="3"
	gen rdjob_eng = inlist(pcs,"383A","384A","385A","386A","388A")
	gen rdjob_tech = inlist(pcs,"473B","474B","475A","478A")
	gen job_eng = (substr(pcs,1,2) == "38")
	gen job_tech = (substr(pcs,1,2) == "47")
	gen job_hskill_noneng = (hskill==1 & job_tech==0 &  job_eng== 0 )
	// other = none of job_eng, job_tech, job_hskill_noneng
	gen other = (job_eng + job_tech + job_hskill_noneng == 0)

	
	// Hours and gross wages of each category, zero for rows outside the category
	foreach cat in job_eng job_tech job_hskill_noneng other rdjob_eng rdjob_tech {
		gen `cat'_hrs = nbheur *(`cat' == 1)
		gen `cat'_sbrut = s_brut *(`cat' == 1)
	}
			
	//allocate ZE
	gen depcom = comt
	// Use the commune comr instead when the workplace code starts with 98 or more, is 69999 or contains ZZZ.
	// NOTE(review): real() returns missing for non-numeric prefixes and missing >= 98 is true in Stata,
	// so empty or non-numeric workplace codes are also replaced by comr -- confirm this is intended
	replace depcom = comr if real(substr(depcom,1,2)) >= 98 | depcom == "69999" | regexm(depcom,"ZZZ")

	// Collapse the codes in the ranges below onto a single commune code each
	replace depcom = "75056" if substr(depcom,1,2) == "75"
	replace depcom = "13055" if inrange(real(depcom),13201,13216)
	replace depcom = "69123" if inrange(real(depcom),69381,69389)
	replace depcom = "59183" if depcom == "59540" | depcom == "59248"
	destring depcom, replace force
	drop if mi(depcom) // drops Corsica
	merge m:1 depcom using "${data}/Utils/depcom_ZE_mapping", nogen keep(1 3)
	//correct some inconsistencies
	drop if siren == 380129866 //Conversion France Telecom -> Orange, public status therefore absent before 2009
/*
	gen year = $y
	merge m:1 siren year using "$intpath/stateowned_toexclude", nogen keep(1) // exlude Orange and subsidiaries
	
*/
	replace apet = "2630Z" if siren == 480570597  & ZE2010 == "0056" //large LG electronics plant suddenly changes industry
	if ($y <= 2005) replace apet = "2651A" if siren == 383475092 //make sure Thales industry is right
	
	// Flag one row per (ZE2010, industry, firm) so that summing the flags counts firms
	bys ZE2010 apet siren: gen nb_siren = _n == 1 
	bys ZE2010 apet siren: gen nb_siren_eng = (_n == 1) & job_eng > 0 & !mi(job_eng)

	// Mean of duree over engineer rows within (ZE2010, industry)
	// NOTE(review): total() counts a missing duree as 0 while the denominator still counts the row
	bys ZE2010 apet: egen mean_duree_inge = total(duree*job_eng)
	bys ZE2010 apet: egen foo = total(job_eng)
	replace mean_duree_inge = mean_duree_inge / foo
	drop foo
	
	// Firm-level file: engineer and total wages and hours per (siren, ZE2010, industry, catjur)
	preserve
		keep siren catjur nbheur s_brut  apet job_eng ZE2010
		bys siren ZE2010 apet catjur: egen s_ing = total(s_brut*job_eng)
		bys siren ZE2010 apet catjur: egen h_ing = total(nbheur*job_eng)
		bys siren ZE2010 apet catjur: egen s_tot = total(s_brut)
		// Full variable name instead of the abbreviation nbh, so this also runs with varabbrev off
		bys siren ZE2010 apet catjur: egen h_tot = total(nbheur)
		// The four totals are constant within group, so the mean returns the group total
		gcollapse (mean) s_??? h_???, by(siren apet catjur ZE2010) fast
		gen year = ${y}		
		compress	
		save "${tmp}/DADSpri_RD_ZE_NAF_siren_${y}", replace
	restore
	
	// Commune-level file
	preserve
	gcollapse (sum) *_hrs *_sbrut nb_siren* (mean) mean_duree_inge, by(depcom apet)
	ren apet ape	
	gen year = ${y}
	compress	
	save "${tmp}/DADSpri_RD_com_NAF_${y}", replace
	restore
	
	// ZE-level file
	gcollapse (sum) *_hrs *_sbrut nb_siren* (mean) mean_duree_inge, by(ZE2010 apet)	
	ren apet ape	
	gen year = ${y}		
	compress	
	save "${tmp}/DADSpri_RD_ZE_NAF_${y}", replace
}


*** append at the ZE level
// Stack the yearly ZE-by-industry files for 2004-2019 into one panel file

clear
foreach y of numlist 2004(1)2019 {
	append using "${tmp}/DADSpri_RD_ZE_NAF_`y'"
	}
compress	
save "${tmp}/DADSpri_RD_ZE_NAF_0419"	, replace


// balance the database
// Every (ZE2010, ape) cell gets one row per year; hours and wages are set to 0 in the added rows.
// ZE2010 is written out in full (the abbreviation ZE only works while varabbrev is on and no
// other variable starts with ZE).
use "${tmp}/DADSpri_RD_ZE_NAF_0419"	, clear
	drop if mi(ZE2010) | mi(ape)
	egen o = group(ZE2010 ape)
	xtset o year
	tsfill, full
	// NOTE(review): only the hrs and sbrut variables are zero-filled; nb_siren, nb_siren_eng and
	// mean_duree_inge stay missing in the rows added by tsfill
	foreach var of varlist *hrs *sbrut {
		replace `var' = 0 if mi(`var')
		}
	// Restore the identifiers in the added rows: empty strings sort first, so the last row of
	// each cell holds the actual value
	bys o (ZE2010): replace ZE2010 = ZE2010[_N] if ZE2010 == ""
	bys o (ape): replace ape = ape[_N] if ape == ""
	drop o
save "${tmp}/DADSpri_RD_ZE_NAF_0419", replace



*** same at the municipality level
// Stack the yearly commune-by-industry files for 2004-2019 into one panel file

clear
foreach y of numlist 2004(1)2019 {
	append using "${tmp}/DADSpri_RD_com_NAF_`y'"
	}
compress	
save "${tmp}/DADSpri_RD_com_NAF_0419"	, replace


// balance the database
// Every (depcom, ape) cell gets one row per year; hours and wages are set to 0 in the added rows.
use "${tmp}/DADSpri_RD_com_NAF_0419"	, clear
	drop if mi(depcom) | mi(ape)
	egen o = group(depcom ape)
	xtset o year
	tsfill, full
	// NOTE(review): only the hrs and sbrut variables are zero-filled; nb_siren, nb_siren_eng and
	// mean_duree_inge stay missing in the rows added by tsfill
	foreach var of varlist *hrs *sbrut {
		replace `var' = 0 if mi(`var')
		}
	// Restore the identifiers in the added rows.
	// depcom is numeric and numeric missing values sort last, so the actual value sits in the
	// FIRST row of each cell (taking the last row would copy a missing value and fill nothing).
	bys o (depcom): replace depcom = depcom[1] if mi(depcom)
	// ape is a string and empty strings sort first, so there the actual value is in the last row
	bys o (ape): replace ape = ape[_N] if ape == ""
	drop o
save "${tmp}/DADSpri_RD_com_NAF_0419", replace





// Engineers
// For each year 2005-2019: keep private firms and save, per (siren, ZE2010, industry),
// the hours and gross wages of engineers (pcs starting with 38) and of all employees.

foreach y of numlist 2005(1)2019 {
	global y `y'
	use siren nbheur s_brut catjur catjur_1 apet apen comt comr pcs pcs_1 filt age duree ${condDADS} using "${tmp}/dads${y}", clear
	
	destring catjur catjur_1, replace force

	// Harmonise the legal category within firm: every row gets the smallest catjur of its siren
	bys siren (catjur): replace catjur = catjur[1]
	// drop Particuliers employeurs
	// Same guard as in the 2005-2019 loop above: substr() on siren is a type mismatch
	// when the yearly file stores siren as a number
	cap confirm string var siren
	if _rc == 0 {
		drop if inlist(substr(string(catjur),1,1),"1","2","3","5","6")  == 1 & substr(siren,1,1)=="P"
		drop if inlist(substr(string(catjur_1),1,1),"1","2","3","5","6")  == 1 & substr(siren,1,1)=="P"
	}

	keep if (floor(catjur/1000)==5) // keep private
	
	// force turns non-numeric identifiers into missing; those rows are dropped
	destring siren, replace force
	drop if mi(siren)
	
	replace apet = apen if mi(apet)
	drop apen
	
	// Up to 2007 the code is treated as NAF rev. 1: naf_rev2 is looked up by siren, then
	// filled by naf_rev1 where still missing (no siren-commune lookup in this section).
	// Folder spelled Utils as in the rest of the file, so the path resolves on case-sensitive systems.
	if `y'<=2007 {
		ren apet naf_rev1
		merge m:1 siren using "${data}/Utils/Siren_NAF_rev12", nogen keep(1 3) keepusing(naf_rev2)
		merge m:1 naf_rev1 using "${data}/Utils/NAF_rev12_impute", nogen keep(1 3 4 5) update  keepusing(naf_rev2)
		drop naf_rev1
		ren naf_rev2 apet
		}
	
	gen job_eng = (substr(pcs,1,2) == "38")
	gen all = 1
	// Hours and gross wages of engineers and of everyone
	foreach cat in job_eng all {
		gen `cat'_hrs = nbheur *(`cat' == 1)
		gen `cat'_sbrut = s_brut *(`cat' == 1)
		}
			
	//allocate ZE
	gen depcom = comt
	// Use the commune comr instead when the workplace code starts with 98 or more, is 69999 or contains ZZZ.
	// NOTE(review): real() returns missing for non-numeric prefixes and missing >= 98 is true in Stata,
	// so empty or non-numeric workplace codes are also replaced by comr -- confirm this is intended
	replace depcom = comr if real(substr(depcom,1,2)) >= 98 | depcom == "69999" | regexm(depcom,"ZZZ")

	// Collapse the codes in the ranges below onto a single commune code each
	replace depcom = "75056" if substr(depcom,1,2) == "75"
	replace depcom = "13055" if inrange(real(depcom),13201,13216)
	replace depcom = "69123" if inrange(real(depcom),69381,69389)
	replace depcom = "59183" if depcom == "59540" | depcom == "59248"
	destring depcom, replace force
	drop if mi(depcom) // drops Corsica
	merge m:1 depcom using "${data}/Utils/depcom_ZE_mapping", nogen keep(1 3)
	
	
	gcollapse (sum) *_hrs *_sbrut, by(siren ZE2010 apet) fast
	
	ren apet ape
	
	gen year = ${y}
		
	save "${tmp}/DADSpri_engineers_ZE_NAF_${y}", replace
}

// Stack the yearly firm-level engineers files into a single dataset.
// The working directory is moved to the tmp folder (and stays there afterwards) so that the
// file names returned by fs can be appended without a path.
cd "${tmp}"
// fs is not a built-in command (presumably the community-contributed fs from SSC -- confirm it is installed).
// It matches every file with this prefix in the folder, including files left over from earlier runs.
fs DADSpri_engineers_ZE_NAF_*.dta
clear
append using `r(files)'
// Remove rows that are identical on every variable
gduplicates drop
compress
save "${tmp}/SirenZE_engineers_naf", replace


